<!DOCTYPE html>
<html lang="zh-cn">
<head>
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
  <title>09-Requests&#43;正则爬取猫眼电影top100 - vzvixb</title>
  <meta name="renderer" content="webkit" />
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1"/>

<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="Cache-Control" content="no-siteapp" />

<meta name="theme-color" content="#f8f5ec" />
<meta name="msapplication-navbutton-color" content="#f8f5ec">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="#f8f5ec">


<meta name="author" content="even" /><meta name="description" content="09.Requests&#43;正则爬取猫眼电影top100 目标网站分析 流程框架 抓取单页内容，(使用requests库得到html) 正则表达式分析，" /><meta name="keywords" content="Hugo, theme, even" />






<meta name="generator" content="Hugo 0.90.1 with theme even" />


<link rel="canonical" href="https://zhouxiaoxin.gitee.io/post/python/09.requests&#43;%E6%AD%A3%E5%88%99%E7%88%AC%E5%8F%96%E7%8C%AB%E7%9C%BC%E7%94%B5%E5%BD%B1top100/" />
<link rel="apple-touch-icon" sizes="180x180" href="/apple-touch-icon.png">
<link rel="icon" type="image/png" sizes="32x32" href="/favicon-32x32.png">
<link rel="icon" type="image/png" sizes="16x16" href="/favicon-16x16.png">
<link rel="manifest" href="/manifest.json">
<link rel="mask-icon" href="/safari-pinned-tab.svg" color="#5bbad5">

<script async src="//busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>

<link href="/sass/main.min.32d4dc642fec98c34c80bebb9c784c50771712b4a8a25d9f4dd9cce3534b426e.css" rel="stylesheet">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fancyapps/fancybox@3.1.20/dist/jquery.fancybox.min.css" integrity="sha256-7TyXnr2YU040zfSP+rEcz29ggW4j56/ujTPwjMzyqFY=" crossorigin="anonymous">


<meta property="og:title" content="09-Requests&#43;正则爬取猫眼电影top100" />
<meta property="og:description" content="09.Requests&#43;正则爬取猫眼电影top100 目标网站分析 流程框架 抓取单页内容，(使用requests库得到html) 正则表达式分析，" />
<meta property="og:type" content="article" />
<meta property="og:url" content="https://zhouxiaoxin.gitee.io/post/python/09.requests&#43;%E6%AD%A3%E5%88%99%E7%88%AC%E5%8F%96%E7%8C%AB%E7%9C%BC%E7%94%B5%E5%BD%B1top100/" /><meta property="article:section" content="post" />
<meta property="article:published_time" content="2018-09-08T16:00:56+08:00" />
<meta property="article:modified_time" content="2018-09-08T16:00:56+08:00" />

<meta itemprop="name" content="09-Requests&#43;正则爬取猫眼电影top100">
<meta itemprop="description" content="09.Requests&#43;正则爬取猫眼电影top100 目标网站分析 流程框架 抓取单页内容，(使用requests库得到html) 正则表达式分析，"><meta itemprop="datePublished" content="2018-09-08T16:00:56+08:00" />
<meta itemprop="dateModified" content="2018-09-08T16:00:56+08:00" />
<meta itemprop="wordCount" content="401">
<meta itemprop="keywords" content="爬虫,Python," /><meta name="twitter:card" content="summary"/>
<meta name="twitter:title" content="09-Requests&#43;正则爬取猫眼电影top100"/>
<meta name="twitter:description" content="09.Requests&#43;正则爬取猫眼电影top100 目标网站分析 流程框架 抓取单页内容，(使用requests库得到html) 正则表达式分析，"/>

<!--[if lte IE 9]>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/classlist/1.1.20170427/classList.min.js"></script>
<![endif]-->

<!--[if lt IE 9]>
  <script src="https://cdn.jsdelivr.net/npm/html5shiv@3.7.3/dist/html5shiv.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/respond.js@1.4.2/dest/respond.min.js"></script>
<![endif]-->

</head>
<body>
  <div id="mobile-navbar" class="mobile-navbar">
  <div class="mobile-header-logo">
    <a href="/" class="logo">Even</a>
  </div>
  <div class="mobile-navbar-icon">
    <span></span>
    <span></span>
    <span></span>
  </div>
</div>
<nav id="mobile-menu" class="mobile-menu slideout-menu">
  <ul class="mobile-menu-list">
    <a href="/">
        <li class="mobile-menu-item">Home</li>
      </a><a href="/post/">
        <li class="mobile-menu-item">Archs</li>
      </a><a href="/tags/">
        <li class="mobile-menu-item">Tags</li>
      </a><a href="/categories/">
        <li class="mobile-menu-item">Cates</li>
      </a><a href="/about/">
        <li class="mobile-menu-item">About</li>
      </a><a href="/pages/runoob/">
        <li class="mobile-menu-item">runoob</li>
      </a><a href="/pages/98wubi/">
        <li class="mobile-menu-item">98wubi</li>
      </a>
  </ul>
</nav>
  <div class="container" id="mobile-panel">
    <header id="header" class="header">
        <div class="logo-wrapper">
  <a href="/" class="logo">Even</a>
</div>

<nav class="site-navbar">
  <ul id="menu" class="menu">
    <li class="menu-item">
        <a class="menu-item-link" href="/">Home</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="/post/">Archs</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="/tags/">Tags</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="/categories/">Cates</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="/about/">About</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="/pages/runoob/">runoob</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="/pages/98wubi/">98wubi</a>
      </li>
  </ul>
</nav>
    </header>

    <main id="main" class="main">
      <div class="content-wrapper">
        <div id="content" class="content">
          <article class="post">
    
    <header class="post-header">
      <h1 class="post-title">09-Requests&#43;正则爬取猫眼电影top100</h1>

      <div class="post-meta">
        <span class="post-time"> 2018-09-08 </span>
        <div class="post-category">
            <a href="/categories/python/"> Python </a>
            </div>
          <span class="more-meta"> 约 401 字 </span>
          <span class="more-meta"> 预计阅读 1 分钟 </span>
        <span id="busuanzi_container_page_pv" class="more-meta"> <span id="busuanzi_value_page_pv"><img src="/img/spinner.svg" alt="spinner.svg"/></span> 次阅读 </span>
      </div>
    </header>

    <div class="post-toc" id="post-toc">
  <h2 class="post-toc-title">文章目录</h2>
  <div class="post-toc-content">
    <nav id="TableOfContents">
  <ul>
    <li><a href="#09requests正则爬取猫眼电影top100">09.Requests+正则爬取猫眼电影top100</a></li>
  </ul>
</nav>
  </div>
</div>
    <div class="post-content">
      <h2 id="09requests正则爬取猫眼电影top100">09.Requests+正则爬取猫眼电影top100</h2>
<ol>
<li>目标网站分析</li>
<li>流程框架</li>
</ol>
<ul>
<li>抓取单页内容，(使用requests库得到html)</li>
<li>正则表达式分析，提取出有用的信息</li>
<li>保存至文件(保存到本地json文件)</li>
<li>开启循环及多线程，对页面进行遍历开启多线程提高速度</li>
</ul>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre tabindex="0" class="chroma"><code><span class="lnt"> 1
</span><span class="lnt"> 2
</span><span class="lnt"> 3
</span><span class="lnt"> 4
</span><span class="lnt"> 5
</span><span class="lnt"> 6
</span><span class="lnt"> 7
</span><span class="lnt"> 8
</span><span class="lnt"> 9
</span><span class="lnt">10
</span><span class="lnt">11
</span><span class="lnt">12
</span><span class="lnt">13
</span><span class="lnt">14
</span><span class="lnt">15
</span><span class="lnt">16
</span><span class="lnt">17
</span><span class="lnt">18
</span><span class="lnt">19
</span><span class="lnt">20
</span><span class="lnt">21
</span><span class="lnt">22
</span><span class="lnt">23
</span><span class="lnt">24
</span><span class="lnt">25
</span><span class="lnt">26
</span><span class="lnt">27
</span><span class="lnt">28
</span><span class="lnt">29
</span><span class="lnt">30
</span><span class="lnt">31
</span><span class="lnt">32
</span><span class="lnt">33
</span><span class="lnt">34
</span><span class="lnt">35
</span><span class="lnt">36
</span><span class="lnt">37
</span><span class="lnt">38
</span><span class="lnt">39
</span><span class="lnt">40
</span><span class="lnt">41
</span><span class="lnt">42
</span><span class="lnt">43
</span><span class="lnt">44
</span><span class="lnt">45
</span><span class="lnt">46
</span><span class="lnt">47
</span><span class="lnt">48
</span><span class="lnt">49
</span><span class="lnt">50
</span></code></pre></td>
<td class="lntd">
<pre tabindex="0" class="chroma"><code class="language-python" data-lang="python"><span class="kn">import</span> <span class="nn">requests</span><span class="o">,</span><span class="nn">re</span><span class="o">,</span><span class="nn">json</span><span class="o">,</span><span class="nn">time</span>
<span class="kn">from</span> <span class="nn">requests.exceptions</span> <span class="kn">import</span> <span class="n">RequestException</span>
<span class="kn">from</span> <span class="nn">multiprocessing</span> <span class="kn">import</span> <span class="n">Pool</span>
<span class="k">def</span> <span class="nf">get_one_page</span><span class="p">(</span><span class="n">url</span><span class="p">):</span> <span class="c1"># </span>
   <span class="k">try</span><span class="p">:</span>
       <span class="n">headers</span> <span class="o">=</span> <span class="p">{</span>
           <span class="s1">&#39;User-Agent&#39;</span><span class="p">:</span> <span class="s1">&#39;Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36&#39;</span>
       <span class="p">}</span>
       <span class="n">response</span> <span class="o">=</span> <span class="n">requests</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">url</span><span class="o">=</span><span class="n">url</span><span class="p">,</span> <span class="n">headers</span><span class="o">=</span><span class="n">headers</span><span class="p">)</span>
       <span class="k">if</span> <span class="n">response</span><span class="o">.</span><span class="n">status_code</span> <span class="o">==</span> <span class="mi">200</span><span class="p">:</span>
           <span class="k">return</span> <span class="n">response</span><span class="o">.</span><span class="n">text</span>
       <span class="k">return</span> <span class="kc">None</span>
   <span class="k">except</span> <span class="n">RequestException</span><span class="p">:</span>
       <span class="k">return</span> <span class="kc">None</span>
<span class="k">def</span> <span class="nf">parse_on_page</span><span class="p">(</span><span class="n">html</span><span class="p">):</span>

   <span class="n">pattern</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="s1">&#39;&lt;dd&gt;.*?board-index.*?&gt;(\d+)&lt;/i&gt;.*?poster-default&#34;.*?data-src=&#34;(.*?)&#34;.*?name&#34;&gt;.*?&lt;a&#39;</span>
   <span class="o">+</span><span class="s1">&#39;.*?&gt;(.*?)&lt;/a&gt;.*?star&#34;&gt;(.*?)&lt;/p&gt;.*?releasetime&#34;&gt;(.*?)&lt;/p&gt;.*?integer&#34;&gt;(.*?)&lt;/i&gt;.*?fraction&#34;&gt;(.*?)&lt;/i&gt;.*?&lt;/dd&gt;&#39;</span><span class="p">,</span> <span class="n">re</span><span class="o">.</span><span class="n">S</span><span class="p">)</span>
   <span class="n">items</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">findall</span><span class="p">(</span><span class="n">pattern</span><span class="p">,</span> <span class="n">html</span><span class="p">)</span>
   <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">items</span><span class="p">:</span>
       <span class="k">yield</span> <span class="p">{</span>
           <span class="s1">&#39;index&#39;</span><span class="p">:</span> <span class="n">item</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span>
           <span class="s1">&#39;image&#39;</span><span class="p">:</span> <span class="n">item</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span>
           <span class="s1">&#39;title&#39;</span><span class="p">:</span> <span class="n">item</span><span class="p">[</span><span class="mi">2</span><span class="p">],</span>
           <span class="s1">&#39;actor&#39;</span><span class="p">:</span> <span class="n">item</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span><span class="o">.</span><span class="n">strip</span><span class="p">()[</span><span class="mi">3</span><span class="p">:],</span>
           <span class="s1">&#39;item&#39;</span><span class="p">:</span> <span class="n">item</span><span class="p">[</span><span class="mi">4</span><span class="p">]</span><span class="o">.</span><span class="n">strip</span><span class="p">()[</span><span class="mi">5</span><span class="p">:],</span>
           <span class="s1">&#39;score&#39;</span><span class="p">:</span> <span class="nb">str</span><span class="p">(</span><span class="n">item</span><span class="p">[</span><span class="mi">5</span><span class="p">])</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">item</span><span class="p">[</span><span class="mi">6</span><span class="p">])</span>
       <span class="p">}</span>
<span class="k">def</span> <span class="nf">write_to_file</span><span class="p">(</span><span class="n">content</span><span class="p">):</span>
   <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="s1">&#39;maoyan.json&#39;</span><span class="p">,</span> <span class="s1">&#39;a&#39;</span><span class="p">,</span> <span class="n">encoding</span><span class="o">=</span><span class="s1">&#39;utf-8&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
       <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">content</span><span class="p">,</span> <span class="n">ensure_ascii</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="o">+</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">&#39;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">main</span><span class="p">(</span><span class="n">num</span><span class="p">):</span>
   <span class="n">url</span> <span class="o">=</span> <span class="s1">&#39;http://maoyan.com/board/4?offset=&#39;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">num</span><span class="p">)</span>
   <span class="n">html</span> <span class="o">=</span> <span class="n">get_one_page</span><span class="p">(</span><span class="n">url</span><span class="p">)</span>
   <span class="n">items</span> <span class="o">=</span> <span class="n">parse_on_page</span><span class="p">(</span><span class="n">html</span><span class="p">)</span>
   <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">items</span><span class="p">:</span>
       <span class="n">write_to_file</span><span class="p">(</span><span class="n">item</span><span class="p">)</span>
   <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="s1">&#39;maoyan.json&#39;</span><span class="p">,</span> <span class="s1">&#39;a&#39;</span><span class="p">,</span> <span class="n">encoding</span><span class="o">=</span><span class="s1">&#39;utf-8&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
       <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s1">&#39;</span><span class="se">\n\n\n</span><span class="s1">&#39;</span><span class="p">)</span>
   <span class="nb">print</span><span class="p">(</span><span class="s1">&#39;写入--- </span><span class="si">%s</span><span class="s1">页 ----完成！&#39;</span> <span class="o">%</span> <span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">num</span><span class="o">/</span><span class="mi">10</span><span class="o">+</span><span class="mi">1</span><span class="p">)))</span>
   
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span>
   <span class="n">page</span> <span class="o">=</span> <span class="nb">input</span><span class="p">(</span><span class="s1">&#39;请输入您要获取猫眼电影的页数：&#39;</span><span class="p">)</span>
   <span class="n">page</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">page</span><span class="p">)</span>
   <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="n">page</span><span class="p">):</span>
       <span class="n">time</span><span class="o">.</span><span class="n">sleep</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
       <span class="n">main</span><span class="p">(</span><span class="n">i</span><span class="o">*</span><span class="mi">10</span><span class="p">)</span>
   <span class="c1"># 多进程</span>
   <span class="c1"># pool = Pool() # 如果没有就创建一个进程池</span>
   <span class="c1"># pool.map(main, [i*10 for i in range(page)]) #map循环</span>
</code></pre></td></tr></table>
</div>
</div>
    </div>

    <div class="post-copyright">
  <p class="copyright-item">
    <span class="item-title">文章作者</span>
    <span class="item-content">even</span>
  </p>
  <p class="copyright-item">
    <span class="item-title">上次更新</span>
    <span class="item-content">
        2018-09-08
        
    </span>
  </p>
  
  
</div>
<footer class="post-footer">
      <div class="post-tags">
          <a href="/tags/%E7%88%AC%E8%99%AB/">爬虫</a>
          <a href="/tags/python/">Python</a>
          </div>
      <nav class="post-nav">
        <a class="prev" href="/post/python/08.selenium%E5%BA%93%E8%AF%A6%E8%A7%A3/">
            <i class="iconfont icon-left"></i>
            <span class="prev-text nav-default">08-Selenium库详解</span>
            <span class="prev-text nav-mobile">上一篇</span>
          </a>
        <a class="next" href="/post/python/10.%E5%88%86%E6%9E%90ajax%E8%AF%B7%E6%B1%82%E5%B9%B6%E6%8A%93%E5%8F%96%E4%BB%8A%E6%97%A5%E5%A4%B4%E6%9D%A1%E8%A1%97%E6%8B%8D%E7%BE%8E%E5%9B%BE-/">
            <span class="next-text nav-default">10-分析Ajax请求并抓取今日头条街拍美图 </span>
            <span class="next-text nav-mobile">下一篇</span>
            <i class="iconfont icon-right"></i>
          </a>
      </nav>
    </footer>
  </article>
        </div>
        

  

  

      </div>
    </main>

    <footer id="footer" class="footer">
      <div class="social-links">
      <a href="mailto:your@email.com" class="iconfont icon-email" title="email"></a>
      <a href="http://localhost:1313" class="iconfont icon-stack-overflow" title="stack-overflow"></a>
      <a href="http://localhost:1313" class="iconfont icon-twitter" title="twitter"></a>
      <a href="http://localhost:1313" class="iconfont icon-facebook" title="facebook"></a>
      <a href="http://localhost:1313" class="iconfont icon-linkedin" title="linkedin"></a>
      <a href="http://localhost:1313" class="iconfont icon-google" title="google"></a>
      <a href="http://localhost:1313" class="iconfont icon-github" title="github"></a>
      <a href="http://localhost:1313" class="iconfont icon-weibo" title="weibo"></a>
      <a href="http://localhost:1313" class="iconfont icon-zhihu" title="zhihu"></a>
      <a href="http://localhost:1313" class="iconfont icon-douban" title="douban"></a>
      <a href="http://localhost:1313" class="iconfont icon-pocket" title="pocket"></a>
      <a href="http://localhost:1313" class="iconfont icon-tumblr" title="tumblr"></a>
      <a href="http://localhost:1313" class="iconfont icon-instagram" title="instagram"></a>
      <a href="http://localhost:1313" class="iconfont icon-gitlab" title="gitlab"></a>
      <a href="http://localhost:1313" class="iconfont icon-bilibili" title="bilibili"></a>
  <a href="https://zhouxiaoxin.gitee.io/index.xml" type="application/rss+xml" class="iconfont icon-rss" title="rss"></a>
</div>

<div class="copyright">
  <span class="power-by">
    由 <a class="hexo-link" href="https://gohugo.io">Hugo</a> 强力驱动
  </span>
  <span class="division">|</span>
  <span class="theme-info">
    主题 - 
    <a class="theme-link" href="https://github.com/olOwOlo/hugo-theme-even">Even</a>
  </span>

  <div class="busuanzi-footer">
    <span id="busuanzi_container_site_pv"> 本站总访问量 <span id="busuanzi_value_site_pv"><img src="/img/spinner.svg" alt="spinner.svg"/></span> 次 </span>
      <span class="division">|</span>
    <span id="busuanzi_container_site_uv"> 本站总访客数 <span id="busuanzi_value_site_uv"><img src="/img/spinner.svg" alt="spinner.svg"/></span> 人 </span>
  </div>

  <span class="copyright-year">
    &copy; 
    2018 - 
    2022
    <span class="heart">
      <i class="iconfont icon-heart"></i>
    </span>
    <span class="author">even</span>
  </span>
</div>
    </footer>

    <div class="back-to-top" id="back-to-top">
      <i class="iconfont icon-up"></i>
    </div>
  </div>
  
  <script src="https://cdn.jsdelivr.net/npm/jquery@3.2.1/dist/jquery.min.js" integrity="sha256-hwg4gsxgFZhOsEEamdOYGBf13FyQuiTwlAQgxVSNgt4=" crossorigin="anonymous"></script>
  <script src="https://cdn.jsdelivr.net/npm/slideout@1.0.1/dist/slideout.min.js" integrity="sha256-t+zJ/g8/KXIJMjSVQdnibt4dlaDxc9zXr/9oNPeWqdg=" crossorigin="anonymous"></script>
  <script src="https://cdn.jsdelivr.net/npm/@fancyapps/fancybox@3.1.20/dist/jquery.fancybox.min.js" integrity="sha256-XVLffZaxoWfGUEbdzuLi7pwaUJv1cecsQJQqGLe7axY=" crossorigin="anonymous"></script>



<script type="text/javascript" src="/js/main.min.2517c0eb67172a0bae917de4af59b10ca2531411a009d4c0b82f5685259e5771.js"></script>








</body>
</html>
